# importing libraries that I will be using.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import plotly.express as px
# Load the border crossing data into a dataframe and display it.
border_crossing = pd.read_csv('Border_Crossing_Entry_Data.csv')
border_crossing
Let's start getting to know the data! This will tell us what we're working with, what we will and will not need, and whether any cleaning is necessary. I work through one dataset at a time.
border_crossing.info()
# The 'Date' column will need converting to datetime. 'Latitude', 'Longitude', and
# 'Point' can go — port code, port name, state, and border area already cover location.
# How many rows are flagged as duplicates?
border_crossing.duplicated().sum()
# Pull the flagged rows out so they can be inspected (and potentially dropped later).
dupe_mask = border_crossing.duplicated()
border_dupes2 = border_crossing[dupe_mask]
border_dupes2
As we can see above, these are not actual duplicates. We will still include these in the dataset.
# Per-column null counts for the whole dataset.
border_crossing.isna().sum()
No need to worry about the nulls here as those columns will be dropped.
border_crossing['State'].unique()
border_crossing['Port Name'].unique()
border_crossing['Measure'].unique()
# Peek at both extremes of 'Value'.
border_crossing.nsmallest(8, 'Value')
border_crossing.nlargest(8, 'Value')
# Check the date range; it has to line up with the drug_overdose dataset,
# and rows will need to be dropped accordingly.
border_max = border_crossing['Date'].max()
border_min = border_crossing['Date'].min()
print(border_max)
print(border_min)
# Load the provisional drug overdose death counts into a dataframe.
drug_overdose = pd.read_csv('VSRR_Provisional_Drug_Overdose_Death_Counts.csv')
drug_overdose
drug_overdose.sample(8)
drug_overdose.info()
# border_crossing goes back to 1997; if this dataset starts later,
# the border_crossing years will be trimmed to match.
drug_overdose['Year'].unique()
# Per-column null counts.
drug_overdose.isna().sum()
border_crossing:
drug_overdose:
# Convert the 'Date' strings to proper datetimes.
border_crossing['Date'] = pd.to_datetime(border_crossing['Date'])
border_crossing.sample(5)
# Confirm the dtype change took effect.
print(border_crossing.dtypes)
# These columns aren't needed; drop them in place (inplace=True modifies the
# existing dataframe rather than creating a new one).
border_crossing.drop(columns=['Port Code', 'Latitude', 'Longitude', 'Point'], inplace=True)
border_crossing.sample(5)
# Keep only rows from 2015 onward.
border_crossing = border_crossing[border_crossing['Date'].dt.year >= 2015]
border_crossing.sample(5)
# Verify that no pre-2015 years remain.
remaining_years = border_crossing['Date'].dt.year.unique()
print(remaining_years)
# Recheck the date range; it has to line up with the drug_overdose dataset.
border_min = border_crossing['Date'].min()
border_max = border_crossing['Date'].max()
print(border_min)
print(border_max)
# drug_overdose only runs through 2023-08-01, so cut anything later than that.
in_range = border_crossing['Date'] <= '2023-08-01'
border_filtered = border_crossing[in_range]
border_filtered.sample(5)
# Confirm nothing past 2023-08-01 survived the cut.
border_min2 = border_filtered['Date'].min()
border_max2 = border_filtered['Date'].max()
print(border_min2)
print(border_max2)
border_filtered['Measure'].unique()
border_filtered['State'].unique()
These are the ports I will be using going forward. Doing multiple ports in each state and multiple types of vehicles was not working.
# For each port, grab the row holding its maximum 'Value', then order by state.
peak_rows = border_filtered.loc[border_filtered.groupby('Port Name')['Value'].idxmax()].sort_values(by='State')
# Only these three columns matter for the comparison.
peak_view = peak_rows[['Port Name', 'State', 'Value']]
# Tell pandas to display every row, not a truncated view.
pd.set_option('display.max_rows', None)
print(peak_view)
Ports that are not listed here will be filtered out and dropped.
# Whitelist of ports to keep — filtering is quicker than dropping 100+ ports one by one.
ports = ['Skagway', 'Nogales', 'San Ysidro', 'Eastport', 'Calais', 'Detroit', 'International Falls', 'Piegan', 'Santa Teresa',
         'Buffalo Niagara Falls', 'Pembina', 'El Paso', 'Highgate Springs', 'Blaine']
border_final = border_filtered[border_filtered['Port Name'].isin(ports)]
# Sanity check: only the whitelisted ports should remain.
border_final['Port Name'].unique()
# Reshape: one column per measure, values summed per state/month.
border_pivot = border_final.pivot_table(values='Value', columns='Measure', index=['State', 'Date'], aggfunc="sum")
border_pivot.sample(5)
Originally, when I used pivot_table() to reshape the dataframe and add all the vehicle columns, there ended up being many NaN values. I didn't understand why until I started locating rows by 'State' and 'Date'. For some states, every vehicle type is recorded with associated values; however, some states don't have certain vehicle types recorded at all.
I decided against imputing because I didn't want kNN-generated values influencing the analytics and models later. Instead, I simply replaced NaN with pd.NA, which signifies that the data is genuinely missing after the pivot-table reshape.
Examples below.
# Example: Michigan records every vehicle type for this month.
mi_rows = border_final.loc[(border_final['State'] == 'Michigan') & (border_final['Date'] == '2023-07-01')]
print(mi_rows)
# Example: California is missing some vehicle types entirely.
ca_rows = border_final.loc[(border_final['State'] == 'California') & (border_final['Date'] == '2023-07-01')]
print(ca_rows)
# Flag the post-pivot gaps as genuinely-missing data rather than imputing them.
border_pivot.replace(np.nan, pd.NA, inplace=True)
border_pivot.sample(5)
# Roll every vehicle-type column up into a single 'Vehicle Total' column.
border_pivot['Vehicle Total'] = border_pivot.sum(axis=1)
border_pivot.sample(5)
# The individual measure columns are no longer needed now that they are totalled.
measure_cols = ['Bus Passengers', 'Buses', 'Pedestrians', 'Personal Vehicle Passengers', 'Personal Vehicles',
                'Rail Containers Empty', 'Rail Containers Loaded', 'Train Passengers', 'Truck Containers Empty',
                'Truck Containers Loaded', 'Trucks', 'Trains']
border_pivot.drop(columns=measure_cols, inplace=True)
border_pivot.sample(5)
border_pivot.info()
# Cast the total down to int64.
border_pivot['Vehicle Total'] = border_pivot['Vehicle Total'].astype('int64')
border_pivot.info()
# Drop the original 'State' column...
drug_overdose.drop(columns=['State'], inplace=True)
drug_overdose.sample(5)
# ...and promote 'State Name' to 'State' in its place.
drug_overdose = drug_overdose.rename(columns={'State Name':'State'})
drug_overdose.sample(5)
# Cast 'Year' to a string ahead of building a combined date column.
drug_overdose['Year'] = drug_overdose['Year'].astype(str)
print(drug_overdose.dtypes)
drug_overdose.info()
# Combine 'Year' and 'Month' with a hyphen into a single date string.
# 'Year' was already cast to str above, so only 'Month' still needs converting.
drug_overdose['Date'] = drug_overdose['Year'] + '-' + drug_overdose['Month'].astype(str)
drug_overdose.sample(5)
# Parse the combined string into a proper datetime.
drug_overdose['Date'] = pd.to_datetime(drug_overdose['Date'])
drug_overdose.sample(5)
# Drop 'Period', 'Percent Complete', 'Percent Pending Investigation', 'Footnote',
# 'Footnote Symbol', 'Predicted Value', 'Year', and 'Month' — no longer needed.
drug_overdose.drop(columns=['Period', 'Percent Complete', 'Percent Pending Investigation', 'Footnote', 'Footnote Symbol', 'Predicted Value', 'Year', 'Month'], inplace=True)
drug_overdose.sample(5)
# List the 'Indicator' categories; everything except 'Number of Drug Overdose Deaths' goes.
drug_overdose['Indicator'].unique()
# Keep only the rows for that single indicator.
keep_mask = drug_overdose['Indicator'] == 'Number of Drug Overdose Deaths'
drug_overdose = drug_overdose[keep_mask]
drug_overdose.sample(5)
# Verify the nulls are gone — as suspected, they lived in the categories just dropped.
drug_overdose.isna().sum()
# Date-range check; this has to line up with the border_crossing dataset.
drug_max = drug_overdose['Date'].max()
drug_min = drug_overdose['Date'].min()
print(drug_max)
print(drug_min)
# States (plus aggregates like 'United States') that are out of scope — exclude them.
drop_states = [ 'Alabama', 'Arkansas', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
                'Hawaii', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
                'Maryland', 'Massachusetts', 'Mississippi', 'Missouri', 'Nebraska', 'Nevada',
                'New Hampshire', 'New Jersey', 'New York City', 'North Carolina', 'Ohio', 'Oklahoma', 'Oregon',
                'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
                'Utah', 'Virginia', 'West Virginia', 'Wisconsin', 'Wyoming', 'United States']
# Keep rows whose State is NOT in the drop list (idiomatic ~isin instead of "== False").
drug_overdose = drug_overdose[~drug_overdose['State'].isin(drop_states)]
drug_overdose['State'].unique()
# Make sure the remaining states match the border_filtered dataset.
border_sorted = sorted(border_filtered['State'].unique())
border_sorted
# 'Indicator' is redundant now that only one indicator remains; 'Data Value' carries the numbers.
drug_overdose.drop(columns=['Indicator'], inplace=True)
drug_overdose.sample(5)
# Rename 'Data Value' to 'Drug Overdose Deaths' so the merged dataset reads cleanly.
drug_overdose = drug_overdose.rename(columns={'Data Value': 'Drug Overdose Deaths'})
drug_overdose.sample(5)
# Verify dtypes; 'Drug Overdose Deaths' still needs to become int64.
drug_overdose.info()
# Casting straight to int64 fails because the values contain thousands separators —
# count how many rows actually have commas.
comma_count = drug_overdose['Drug Overdose Deaths'].str.contains(',').sum()
print(comma_count)
# Strip the commas; after that the int64 cast goes through.
drug_overdose['Drug Overdose Deaths'] = drug_overdose['Drug Overdose Deaths'].str.replace(',', '').astype('int64')
drug_overdose.info()
# Merge the pivoted border data with overdose deaths on the shared State/Date keys.
final_border = border_pivot.merge(drug_overdose, on=['State', 'Date'])
final_border.sample(10)
# Post-merge check: no NaN values introduced. Success!
final_border.isna().sum()
final_border.info()
# Spot-check a single state/month row.
spot_check = final_border.loc[(final_border['State'] == 'Texas') & (final_border['Date'] == '2018-07-01')]
print(spot_check)
For the descriptive statistics part of this project I wanted to understand the spread of drug overdoses and vehicle frequency in states along the U.S.–Mexico and U.S.–Canada borders. The visualizations are broken up by drug overdoses and vehicle frequency, and then shown together. I started with describe() on the 'Drug Overdose Deaths' column, which gives a basic overview of elementary statistics. The average number of drug overdose deaths per month is ~1501 and the maximum is ~12000. Below are more visualizations that help show what drug overdose deaths in border states look like.
final_border['Drug Overdose Deaths'].describe()
od_deaths = final_border['Drug Overdose Deaths']
# Density of monthly overdose deaths.
plt.figure(figsize=(17,8))
sns.kdeplot(od_deaths)
# Trend over time.
plt.figure(figsize=(17,7))
sns.lineplot(x='Date', y='Drug Overdose Deaths', data=final_border)
# Year-by-year spread.
plt.figure(figsize=(17,9))
final_border['Year'] = final_border['Date'].dt.year
sns.boxplot(x='Year', y='Drug Overdose Deaths', width=0.5, data=final_border)
# Per-state averages.
plt.figure(figsize=(17,8))
sns.barplot(x='State', y='Drug Overdose Deaths', data=final_border)
final_border['Vehicle Total'].describe()
vehicle_totals = final_border['Vehicle Total']
# Density of monthly vehicle totals.
plt.figure(figsize=(17,8))
sns.kdeplot(vehicle_totals)
# Per-state averages, with x labels angled for readability.
plt.figure(figsize=(17,8))
plt.xticks(rotation=45)
sns.barplot(x='State', y='Vehicle Total', data=final_border)
# Trend over time.
plt.figure(figsize=(17,7))
sns.lineplot(x='Date', y='Vehicle Total', data=final_border)
# Distribution per state.
plt.figure(figsize=(17,9))
sns.boxenplot(data=final_border, x="Vehicle Total", y="State")
# MinMax-scale both columns to [0, 1] so they can share an axis in later plots.
scaler = MinMaxScaler()
scale_cols = ['Vehicle Total', 'Drug Overdose Deaths']
final_border[scale_cols] = scaler.fit_transform(final_border[scale_cols])
final_border.sample(5)
# Overlay the two scaled KDEs on a single figure.
# (Removed a stray plt.figure call that opened an empty, unused figure.)
scaled_DrugOD_column = final_border['Drug Overdose Deaths']
scaled_vehicle_total = final_border['Vehicle Total']
plt.figure(figsize=(17,8))
# fill expects a boolean — the previous fill='yes' only worked by truthiness.
sns.kdeplot(scaled_DrugOD_column, fill=True)
sns.kdeplot(scaled_vehicle_total, fill=True)
# Both scaled series on one line plot, with confidence bands suppressed.
plt.figure(figsize=(17,7))
sns.set_style('darkgrid')
sns.lineplot(x='Date', y='Drug Overdose Deaths', data=final_border, errorbar=None, color='green')
sns.lineplot(x='Date', y='Vehicle Total', data=final_border, errorbar=None, color='orange')
# Scatter of deaths vs. vehicle totals, colored by state.
plt.figure(figsize=(17,9))
sns.scatterplot(data=final_border, x="Drug Overdose Deaths", y="Vehicle Total", hue="State")
# Same scatter, zoomed in on the dense lower-left cluster.
plt.figure(figsize=(17,9))
sns.scatterplot(data=final_border, x="Drug Overdose Deaths", y="Vehicle Total", hue="State")
plt.xlim(0.0,0.4)
plt.ylim(0.0,0.4)
Correlation measures the strength of the relationship between two variables. Below I computed the correlation between drug overdose deaths and vehicle frequency; on a scale from -1 (negative relationship) to 1 (positive relationship), the correlation here is 0.8! There is a positive relationship: when the vehicle total rises, drug overdose deaths rise as well.
# Pearson correlation between vehicle totals and overdose deaths.
pearson_r = final_border['Vehicle Total'].corr(final_border['Drug Overdose Deaths'])
print(pearson_r)
# Scatter with a fitted regression line.
sns.lmplot(data=final_border, x="Drug Overdose Deaths", y="Vehicle Total", height=7)
Null Hypothesis:
$H_{0}$: The fewer vehicles that pass through the most frequented ports in each state, the lower the occurrence of overdose deaths in U.S.–Mexico and U.S.–Canada bordering states.
Alternative Hypothesis:
$H_{A}$: The more vehicles that pass through the most frequented ports in each state, the greater the occurrence of overdose deaths in U.S.–Mexico and U.S.–Canada bordering states.
Planned Statistical Test:
Linear Regression
Metrics generated from planned statistical test:
T-statistic, P-Value
Alpha Value (threshold):
$\alpha$: 0.05
if p-value is < 0.05 we reject the null hypothesis.
if p-value is > 0.05 we fail to reject the null hypothesis.
# Independent (x) and dependent (y) variables — only these two columns are needed.
x = final_border['Vehicle Total']
y = final_border['Drug Overdose Deaths']
# Add the intercept term.
x_const = sm.add_constant(x)
# Fit ordinary least squares and report the summary statistics.
ols_fit = sm.OLS(y, x_const).fit()
print(ols_fit.summary())
# First go with plotly: scatter plus an OLS trendline. Should have used this all along.
fig = px.scatter(final_border, x="Drug Overdose Deaths", y="Vehicle Total", trendline="ols", trendline_color_override="orange")
fig.show()